Importo librerías¶

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
import scipy.stats as ss
import warnings

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

Funciones¶

In [2]:
def plot_feature(df, col_name, isContinuous, target):
    """
    Visualize a variable on its own and faceted by the target variable.

    Parameters
    ----------
    df : pandas.DataFrame containing the data.
    col_name : name of the column to plot.
    isContinuous : True if the variable is continuous, False if categorical.
    target : name of the target column used for the faceted panel.
    """
    f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 3), dpi=90)

    count_null = df[col_name].isnull().sum()
    if isContinuous:
        sns.histplot(df.loc[df[col_name].notnull(), col_name], kde=False, ax=ax1)
    else:
        sns.countplot(df, x=col_name, color='#5975A4', saturation=1, ax=ax1)
    ax1.set_xlabel(col_name)
    ax1.set_ylabel('Count')
    ax1.set_title(col_name + ' Numero de nulos: ' + str(count_null))
    plt.xticks(rotation=90)

    if isContinuous:
        # Continuous: distribution of the variable for each target class.
        sns.boxplot(x=col_name, y=target, data=df, ax=ax2)
        ax2.set_ylabel('')
        ax2.set_title(col_name + ' by ' + target)
    else:
        # Categorical: proportion of each target class within every category.
        data = df.groupby(col_name)[target].value_counts(normalize=True).to_frame('proportion').reset_index()
        # BUG FIX: the original assigned `[i, target, 'proportion']` where `i`
        # is undefined inside this function (it only worked because a notebook
        # loop variable leaked into the global namespace).
        data.columns = [col_name, target, 'proportion']
        sns.barplot(x=col_name, y='proportion', hue=target, data=data, saturation=1, ax=ax2)
        ax2.set_ylabel(target + ' fraction')
        ax2.set_title(target)
        plt.xticks(rotation=90)
    ax2.set_xlabel(col_name)

    plt.tight_layout()

def dame_variables_categoricas(dataset=None):
    """
    Split the non-numeric columns of a dataframe by cardinality.

    Parameters
    ----------
    dataset : pandas.DataFrame containing the data.

    Returns
    -------
    tuple (lista_variables_categoricas, other)
        - lista_variables_categoricas: non float/int columns with fewer than
          100 distinct non-null values (treated as categorical).
        - other: remaining non float/int columns (>= 100 distinct values).
    int
        1 when no dataset is provided (incorrect execution).
        NOTE: the original docstring claimed a single list was returned; the
        function has always returned a tuple.
    """
    if dataset is None:
        print(u'\nFaltan argumentos por pasar a la función')
        return 1
    lista_variables_categoricas = []
    other = []
    for col in dataset.columns:
        if dataset[col].dtype != float and dataset[col].dtype != int:
            # nunique() ignores NaN, matching the original
            # len(np.unique(series.dropna())) computation.
            if dataset[col].nunique() < 100:
                lista_variables_categoricas.append(col)
            else:
                other.append(col)

    return lista_variables_categoricas, other


def get_corr_matrix(dataset = None, metodo='pearson', size_figure=[10,8]):
    """
    Draw the correlation matrix of a dataframe as a heatmap.

    Pass metodo='spearman' to use Spearman instead of Pearson correlation.
    Returns 0 on success, 1 when no dataset is given.
    """
    if dataset is None:
        print(u'\nHace falta pasar argumentos a la función')
        return 1
    sns.set(style="white")
    corr = dataset.corr(method=metodo)
    # Zero out the diagonal so the perfect self-correlations do not
    # dominate the colour scale (same effect as looping over corr.iloc[i, i]).
    np.fill_diagonal(corr.values, 0)
    fig, ax = plt.subplots(figsize=size_figure)
    sns.heatmap(corr, center=0, square=True, linewidths=.5, cmap='viridis')
    plt.show()

    return 0

def get_deviation_of_mean_perc(pd_loan, list_var_continuous, target, multiplier):
    """
    For each continuous variable, find the rows lying outside the interval
    mean +/- multiplier * std and report the target distribution among them.

    Parameters
    ----------
    pd_loan : pandas.DataFrame with the data.
    list_var_continuous : list of continuous column names to inspect.
    target : name of the target column.
    multiplier : half-width of the acceptance interval in standard deviations.

    Returns
    -------
    pandas.DataFrame with one row per variable that has outliers: the target
    class proportions among the outlier rows, the variable name, the number
    of outliers and their fraction of the column.
    """
    pd_final = pd.DataFrame()

    for i in list_var_continuous:
        series_mean = pd_loan[i].mean()
        series_std = pd_loan[i].std()
        std_amp = multiplier * series_std
        left = series_mean - std_amp
        right = series_mean + std_amp
        size_s = pd_loan[i].size

        # Rows outside the confidence interval (computed once, reused below).
        outlier_mask = (pd_loan[i] < left) | (pd_loan[i] > right)
        perc_excess = pd_loan[i][outlier_mask].size / size_s

        if perc_excess > 0:
            pd_concat_percent = pd.DataFrame(pd_loan[target][outlier_mask]
                                             .value_counts(normalize=True).reset_index()).T
            pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0],
                                         pd_concat_percent.iloc[0, 1]]
            # BUG FIX: the original dropped the hardcoded row 'fraud_bool',
            # breaking the function for any other target; use the `target`
            # parameter (consistent with get_percent_null_values_target).
            pd_concat_percent = pd_concat_percent.drop(target, axis=0)
            pd_concat_percent['variable'] = i
            pd_concat_percent['sum_outlier_values'] = pd_loan[i][outlier_mask].size
            # Column name kept for backwards compatibility even though it
            # holds the outlier (not null) percentage.
            pd_concat_percent['porcentaje_sum_null_values'] = perc_excess
            pd_final = pd.concat([pd_final, pd_concat_percent], axis=0).reset_index(drop=True)

    if pd_final.empty:
        # BUG FIX: the original message talked about null values although
        # this function reports outliers.
        print('No existen variables con valores atípicos')

    return pd_final


def get_percent_null_values_target(pd_loan, list_var_continuous, target):
    """
    Report the target distribution among the "missing" rows of each variable.

    Missingness is encoded differently depending on the column:
    - a -1 sentinel for the count/length columns listed below,
    - any negative value for 'intended_balcon_amount',
    - a real NaN for every other column.

    Parameters
    ----------
    pd_loan : pandas.DataFrame with the data.
    list_var_continuous : list of continuous column names to inspect.
    target : name of the target column.

    Returns
    -------
    pandas.DataFrame with one row per variable that has missing values: the
    target class proportions among those rows, the variable name, the number
    of missing values and their fraction of the dataframe.
    """
    # Columns where -1 is the dataset's missing-value sentinel.
    sentinel_minus_one = ['prev_address_months_count', 'current_address_months_count',
                          'bank_months_count', 'session_length_in_minutes',
                          'device_distinct_emails_8w']

    pd_final = pd.DataFrame()
    for i in list_var_continuous:
        # Pick the missingness encoding for this column, then delegate the
        # (previously triplicated) aggregation to a single helper.
        if i in sentinel_minus_one:
            mask = pd_loan[i] == -1
        elif i == 'intended_balcon_amount':
            mask = pd_loan[i] < 0
        else:
            mask = pd_loan[i].isnull()
        if mask.sum() > 0:
            pd_final = pd.concat([pd_final,
                                  _percent_row_for_mask(pd_loan, target, i, mask)],
                                 axis=0).reset_index(drop=True)

    if pd_final.empty:
        print('No existen variables con valores nulos')

    return pd_final


def _percent_row_for_mask(pd_loan, target, var_name, mask):
    """Build the one-row summary frame for the rows selected by `mask`."""
    pd_concat_percent = pd.DataFrame(pd_loan[target][mask]
                                     .value_counts(normalize=True).reset_index()).T
    pd_concat_percent.columns = [pd_concat_percent.iloc[0, 0],
                                 pd_concat_percent.iloc[0, 1]]
    pd_concat_percent = pd_concat_percent.drop(target, axis=0)
    pd_concat_percent['variable'] = var_name
    pd_concat_percent['sum_null_values'] = mask.sum()
    pd_concat_percent['porcentaje_sum_null_values'] = mask.sum() / pd_loan.shape[0]
    return pd_concat_percent



def cramers_v(confusion_matrix):
    """
    Cramer's V statistic for categorical-categorical association, with the
    bias correction from Bergsma and Wicher, Journal of the Korean
    Statistical Society 42 (2013): 323-328.

    Parameters
    ----------
    confusion_matrix : 2-D array of observed counts, or a pd.crosstab() table.

    Returns
    -------
    float : corrected Cramer's V in [0, 1].
    """
    # BUG FIX: accept pd.crosstab frames directly, as the docstring promised.
    # DataFrame.sum() returns a per-column Series, so the original computed a
    # non-scalar `n`; coercing to a numpy array yields the scalar grand total.
    confusion_matrix = np.asarray(confusion_matrix)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    # Bias-corrected phi^2 and dimensions (clipped at zero).
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

Cargamos los datos¶

Carga de los datos del preprocesamiento y agrupamiento de los distintos tipos de variables en varias listas (categóricas, enteros, continuos, numéricos y todas las variables juntas)

In [3]:
# Load the dataset produced by the initial-preprocessing notebook.
pd_fraud = pd.read_csv('./data_preprocessing/pd_data_initial_preprocessing.csv')
In [4]:
# Drop the artefact index column ('Unnamed: 0') that was written because the
# previous notebook saved with to_csv() without index=False.
pd_fraud = pd_fraud.drop(columns='Unnamed: 0')
pd_fraud
Out[4]:
fraud_bool income name_email_similarity prev_address_months_count current_address_months_count customer_age days_since_request intended_balcon_amount payment_type zip_count_4w velocity_6h velocity_24h velocity_4w bank_branch_count_8w date_of_birth_distinct_emails_4w employment_status credit_risk_score email_is_free housing_status phone_home_valid phone_mobile_valid bank_months_count has_other_cards proposed_credit_limit foreign_request source session_length_in_minutes device_os keep_alive_session device_distinct_emails_8w device_fraud_count month
0 1 0.9 0.166828 -1 88 50 0.020925 -1.331345 AA 769 10650.765523 3134.319630 3863.647740 1 6 CA 185 0 BA 1 0 24 0 500.0 0 INTERNET 3.888115 windows 0 1 0 7
1 1 0.9 0.296286 -1 144 50 0.005418 -0.816224 AB 366 534.047319 2670.918292 3124.298166 718 3 CA 259 1 BA 0 0 15 0 1500.0 0 INTERNET 31.798819 windows 0 1 0 7
2 1 0.9 0.044985 -1 132 40 3.108549 -0.755728 AC 870 4048.534263 2893.621498 3159.590679 1 14 CB 177 1 BA 0 1 -1 0 200.0 0 INTERNET 4.728705 other 0 1 0 7
3 1 0.9 0.159511 -1 22 50 0.019079 -1.205124 AB 810 3457.064063 4054.908412 3022.261812 1921 6 CA 110 1 BA 0 1 31 1 200.0 0 INTERNET 2.047904 linux 0 1 0 7
4 1 0.9 0.596414 -1 218 50 0.004441 -0.773276 AB 890 5020.341679 2728.237159 3087.670952 1990 2 CA 295 1 BA 1 0 31 0 1500.0 0 INTERNET 3.775225 macintosh 1 1 0 7
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
999995 0 0.6 0.192631 -1 104 40 0.030592 -1.044454 AB 804 7905.711839 8341.468557 4972.635997 1 8 CA 75 1 BC 1 1 25 0 200.0 0 INTERNET 8.511502 linux 1 1 0 4
999996 0 0.8 0.322989 148 9 50 1.628119 -1.409803 AC 3306 5391.470463 4955.170808 5022.728108 0 2 CC 154 1 BC 1 1 -1 0 200.0 0 INTERNET 8.967865 windows 0 1 0 4
999997 0 0.8 0.879403 -1 30 20 0.018563 34.692760 AA 1522 8063.102636 5670.654316 4377.196321 2023 6 CF 64 0 BC 0 1 11 0 200.0 0 INTERNET 8.195531 other 0 1 0 4
999998 0 0.9 0.762112 -1 189 20 0.015352 94.661055 AA 1418 8092.641762 3982.582204 4394.803296 1678 6 CA 163 0 BA 1 0 28 0 500.0 0 INTERNET 4.336064 windows 1 1 0 4
999999 0 0.2 0.697452 -1 321 20 2.655916 9.908499 AA 951 6169.630036 3695.308261 4352.334543 2 12 CA 36 1 BE 0 1 15 0 200.0 0 INTERNET 6.717022 linux 0 1 0 4

1000000 rows × 32 columns

Guardamos en una lista aquellas variables que son categóricas

In [5]:
# Columns to be treated as categorical (includes the binary flags, the
# target 'fraud_bool' and 'month').
lista_variables_categoricas = [
    'payment_type',
    'employment_status',
    'email_is_free',
    'housing_status',
    'phone_home_valid',
    'phone_mobile_valid',
    'has_other_cards',
    'foreign_request',
    'source',
    'device_os',
    'keep_alive_session',
    'fraud_bool',
    'month'
]

Con un bucle cambiamos el type de esas variables a 'category'

In [6]:
# Cast each selected column to pandas' memory-efficient 'category' dtype.
for col in lista_variables_categoricas:
    pd_fraud[col] = pd_fraud[col].astype('category')
In [7]:
# Confirm the dtype conversion took effect.
pd_fraud.dtypes
Out[7]:
fraud_bool                          category
income                               float64
name_email_similarity                float64
prev_address_months_count              int64
current_address_months_count           int64
customer_age                           int64
days_since_request                   float64
intended_balcon_amount               float64
payment_type                        category
zip_count_4w                           int64
velocity_6h                          float64
velocity_24h                         float64
velocity_4w                          float64
bank_branch_count_8w                   int64
date_of_birth_distinct_emails_4w       int64
employment_status                   category
credit_risk_score                      int64
email_is_free                       category
housing_status                      category
phone_home_valid                    category
phone_mobile_valid                  category
bank_months_count                      int64
has_other_cards                     category
proposed_credit_limit                float64
foreign_request                     category
source                              category
session_length_in_minutes            float64
device_os                           category
keep_alive_session                  category
device_distinct_emails_8w              int64
device_fraud_count                     int64
month                               category
dtype: object

Guardamos todas las variables que son numéricas

In [8]:
todas_las_variables = pd_fraud.columns

# Numeric variables are simply every column not flagged as categorical.
lista_variables_numericas = [
    variable for variable in todas_las_variables
    if variable not in lista_variables_categoricas
]
lista_variables_numericas
Out[8]:
['income',
 'name_email_similarity',
 'prev_address_months_count',
 'current_address_months_count',
 'customer_age',
 'days_since_request',
 'intended_balcon_amount',
 'zip_count_4w',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'bank_branch_count_8w',
 'date_of_birth_distinct_emails_4w',
 'credit_risk_score',
 'bank_months_count',
 'proposed_credit_limit',
 'session_length_in_minutes',
 'device_distinct_emails_8w',
 'device_fraud_count']

Separación en train y test estratificado¶

Visualizamos la proporción de la variable objetivo en el dataset

In [9]:
# Percentage and absolute count of each target class.
pd_fraude_bool = pd_fraud['fraud_bool'].value_counts(normalize = True).mul(100).reset_index()
# BUG FIX: map the counts by class label instead of assigning the raw
# value_counts Series, whose alignment with the reset RangeIndex only worked
# by coincidence (the class labels happen to be 0 and 1).
pd_fraude_bool['conteo'] = pd_fraude_bool['fraud_bool'].map(pd_fraud['fraud_bool'].value_counts())

pd_fraude_bool
Out[9]:
fraud_bool proportion conteo
0 0 98.8971 988971
1 1 1.1029 11029

La variable objetivo representa en un 98.9% que no se cometió fraude y en un 1.1% que sí se cometió

In [10]:
# Interactive bar chart (plotly) of the fraud proportion.
fig = px.histogram(pd_fraude_bool, x="fraud_bool", y=['proportion'], title = 'Proporción de fraudes',
                   labels = {"fraud_bool": "fraude"})
fig.show()

Separamos los datos en dos conjuntos en los que se mantenga una proporción muy similar de la variable objetivo

In [11]:
# Stratified 80/20 split so both sets keep the ~1.1% fraud rate.
# BUG FIX: a fixed random_state makes the split — and every result derived
# from it below — reproducible across kernel restarts.
X_pd_fraud, X_pd_fraud_test, y_pd_fraud, y_pd_fraud_test = train_test_split(pd_fraud.drop('fraud_bool',axis=1), 
                                                                     pd_fraud['fraud_bool'], 
                                                                     stratify=pd_fraud['fraud_bool'], 
                                                                     test_size=0.2,
                                                                     random_state=42)
pd_fraud_train = pd.concat([X_pd_fraud, y_pd_fraud],axis=1)
pd_fraud_test = pd.concat([X_pd_fraud_test, y_pd_fraud_test],axis=1)
In [12]:
# Check that the stratified split preserved the class proportions.
print('== Train\n', pd_fraud_train['fraud_bool'].value_counts(normalize=True))
print('== Test\n', pd_fraud_test['fraud_bool'].value_counts(normalize=True))
== Train
 fraud_bool
0    0.988971
1    0.011029
Name: proportion, dtype: float64
== Test
 fraud_bool
0    0.98897
1    0.01103
Name: proportion, dtype: float64

Visualización descriptiva de los datos¶

In [13]:
# Columns whose missing values are NOT plain NaN: the count/length columns
# use a -1 sentinel and intended_balcon_amount uses negative values, so they
# are excluded from the NaN count and handled separately below.
lista_nulos_dif = ['prev_address_months_count', 'current_address_months_count', 'bank_months_count',\
                   'session_length_in_minutes', 'device_distinct_emails_8w', 'intended_balcon_amount']

pd_series_null_columns = pd_fraud_train\
.drop(lista_nulos_dif, axis=1)\
.isnull().sum().sort_values(ascending=False)

# ## Count the -1 sentinels in the columns that use them.

lista_nulos_neg_1 = ['prev_address_months_count', 'current_address_months_count', 'bank_months_count',\
                  'session_length_in_minutes', 'device_distinct_emails_8w']

pd_series_null_columns2 = (pd_fraud_train[lista_nulos_neg_1] == -1)\
.sum().sort_values(ascending=False)

# ## intended_balcon_amount encodes missing as negative values.

pd_series_null_columns3 = (pd_fraud_train['intended_balcon_amount'] < 0).sum()
pd_series_null_columns3 = pd.Series(pd_series_null_columns3, index = ['intended_balcon_amount'])


# ## Combine the three missing-value counts into one ranked series.

pd_null_columnas_final = pd.concat((pd_series_null_columns, pd_series_null_columns2, pd_series_null_columns3))\
.sort_values(ascending = False)
In [14]:
# Absolute number and fraction of "missing" values per column (train set).
pd_null_columnas = pd.DataFrame(pd_null_columnas_final, columns=['nulos_columnas']) 
pd_null_columnas['porcentaje_columnas'] = pd_null_columnas['nulos_columnas']/pd_fraud_train.shape[0]
pd_null_columnas
Out[14]:
nulos_columnas porcentaje_columnas
intended_balcon_amount 594311 0.742889
prev_address_months_count 570079 0.712599
bank_months_count 203164 0.253955
current_address_months_count 3394 0.004242
session_length_in_minutes 1618 0.002022
device_distinct_emails_8w 293 0.000366
name_email_similarity 0 0.000000
fraud_bool 0 0.000000
customer_age 0 0.000000
days_since_request 0 0.000000
payment_type 0 0.000000
zip_count_4w 0 0.000000
velocity_6h 0 0.000000
velocity_24h 0 0.000000
velocity_4w 0 0.000000
bank_branch_count_8w 0 0.000000
income 0 0.000000
employment_status 0 0.000000
credit_risk_score 0 0.000000
email_is_free 0 0.000000
housing_status 0 0.000000
phone_home_valid 0 0.000000
phone_mobile_valid 0 0.000000
has_other_cards 0 0.000000
proposed_credit_limit 0 0.000000
foreign_request 0 0.000000
source 0 0.000000
device_os 0 0.000000
keep_alive_session 0 0.000000
device_fraud_count 0 0.000000
month 0 0.000000
date_of_birth_distinct_emails_4w 0 0.000000

Observamos la cantidad de valores missing en las variables del dataset de entrenamiento. Vemos que las variables 'intended_balcon_amount', 'prev_address_months_count' y 'bank_months_count' poseen una gran cantidad de valores missing, con un 74.3, 71.3 y 25.4% respectivamente de toda la muestra

In [15]:
# Temporarily treat these integer columns as float so the plotting loop below
# renders them as continuous variables (reverted after plotting).
for col in ['prev_address_months_count', 'current_address_months_count',
            'zip_count_4w', 'bank_branch_count_8w', 'credit_risk_score']:
    pd_fraud_train[col] = pd_fraud_train[col].astype(float)

Hacemos la transformación de estas variables para que las gráficas se vean mejor; después las volvemos a convertir

In [16]:
# NOTE(review): blanket warning suppression hides potentially useful
# seaborn/pandas warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')
target = 'fraud_bool'
# Plot every variable except the target itself: float columns as continuous,
# everything else as categorical.
for i in todas_las_variables:
    print(i)

    if (pd_fraud_train[i].dtype==float) & (i!='fraud_bool'):
        plot_feature(pd_fraud_train, col_name=i, isContinuous=True, target='fraud_bool')
    elif  i!='fraud_bool':
        plot_feature(pd_fraud_train, col_name=i, isContinuous=False, target='fraud_bool')
fraud_bool
income
name_email_similarity
prev_address_months_count
current_address_months_count
customer_age
days_since_request
intended_balcon_amount
payment_type
zip_count_4w
velocity_6h
velocity_24h
velocity_4w
bank_branch_count_8w
date_of_birth_distinct_emails_4w
employment_status
credit_risk_score
email_is_free
housing_status
phone_home_valid
phone_mobile_valid
bank_months_count
has_other_cards
proposed_credit_limit
foreign_request
source
session_length_in_minutes
device_os
keep_alive_session
device_distinct_emails_8w
device_fraud_count
month
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

He intentado configurar los datos y la función de varias formas ya que parece que no se visualizan muy correctamente las distintas variables y me he quedado con esta configuración que es en la que mejor se ve en general.

En los gráficos se puede ver que se mantiene aproximadamente la misma proporción de fraudes cometidos y no cometidos a lo largo de las distintas variables. Los casos más interesantes pueden ser en la variable 'bank_months_count' donde existe un pico en la proporción de fraudes cometidos en el valor 17, y también en la variable 'customer_age', donde va aumentando la proporción de fraudes cometidos a medida que la edad es mayor.

In [17]:
# Revert the columns that were cast to float for plotting back to integers.
for col in ['prev_address_months_count', 'current_address_months_count',
            'zip_count_4w', 'bank_branch_count_8w', 'credit_risk_score']:
    pd_fraud_train[col] = pd_fraud_train[col].astype(int)

Tratamiento de las variables continuas¶

In [18]:
# Continuous variables = the float-typed columns of the original frame.
lista_variables_continuas = [
    variable for variable in pd_fraud.columns
    if pd_fraud[variable].dtypes == 'float'
]
lista_variables_continuas
Out[18]:
['income',
 'name_email_similarity',
 'days_since_request',
 'intended_balcon_amount',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'proposed_credit_limit',
 'session_length_in_minutes']

Tratamiento de outliers¶

In [19]:
# Share of outliers (outside mean +/- 3 std) per continuous variable, with
# the target distribution among those outlier rows.
get_deviation_of_mean_perc(pd_fraud_train, lista_variables_continuas, target='fraud_bool', multiplier=3)
Out[19]:
0.0 1.0 variable sum_outlier_values porcentaje_sum_null_values
0 0.987643 0.012357 days_since_request 14162 0.017702
1 0.990258 0.009742 intended_balcon_amount 15089 0.018861
2 0.992770 0.007230 velocity_6h 3458 0.004322
3 0.997585 0.002415 velocity_24h 414 0.000517
4 0.871431 0.128569 proposed_credit_limit 4939 0.006174
5 0.979379 0.020621 session_length_in_minutes 18816 0.023520

En la variable 'proposed_credit_limit' vemos que tiene un mayor porcentaje de fraude cometido, con un 13%. Después del análisis, y como no hay demasiados outliers, he decidido dejarlos sin sustituir.

Correlaciones¶

In [20]:
# Pearson correlation heatmap of the continuous variables.
get_corr_matrix(dataset = pd_fraud_train[lista_variables_continuas], 
                metodo='pearson', size_figure=[10,8])
No description has been provided for this image
Out[20]:
0
In [21]:
# List the strongest pairwise correlations: absolute values, lower triangle
# only (k=-1 excludes the diagonal) so each pair appears exactly once.
corr_abs = pd_fraud_train[lista_variables_continuas].corr('pearson').abs()
corr_abs.loc[:, :] = np.tril(corr_abs, k=-1)
pares_corr = (
    corr_abs.stack()
    .to_frame('correlation')
    .reset_index()
    .sort_values(by='correlation', ascending=False)
)
pares_corr[pares_corr['correlation'] > 0.4]
Out[21]:
level_0 level_1 correlation
59 velocity_4w velocity_24h 0.539547
49 velocity_24h velocity_6h 0.464186
58 velocity_4w velocity_6h 0.400398

Se ve claramente en el gráfico y la tabla que las variables con mayor correlación son: velocity_4w con velocity_24h, velocity_24h con velocity_6h y velocity_6h con velocity_4w.

Aquí determino que no influye la colinealidad de estas variables en los resultados del análisis por lo que decido no eliminarlas

Tratamiento de valores nulos¶

In [22]:
# Target distribution among the "missing" rows of each continuous variable.
get_percent_null_values_target(pd_fraud_train, lista_variables_continuas, target='fraud_bool')
Out[22]:
0.0 1.0 variable sum_null_values porcentaje_sum_null_values
0 0.986874 0.013126 intended_balcon_amount 594311 0.742889
1 0.993820 0.006180 session_length_in_minutes 1618 0.002022

En este caso, la variable 'intended_balcon_amount' presenta un gran número de valores nulos en el dataframe de entrenamiento, por lo que eliminar las filas donde esta variable sea nula está descartado.

Este algoritmo parece que no se ve muy afectado por la presencia de valores nulos en dicha variable, por lo que la opción que se toma aquí es la de aceptar esos valores en el dataframe.

Tratamiento de las variables categóricas¶

Para calcular la correlación de spearman, primero tengo que convertir las variables categóricas a numéricas

In [23]:
# Recap of the categorical variable names.
lista_variables_categoricas
Out[23]:
['payment_type',
 'employment_status',
 'email_is_free',
 'housing_status',
 'phone_home_valid',
 'phone_mobile_valid',
 'has_other_cards',
 'foreign_request',
 'source',
 'device_os',
 'keep_alive_session',
 'fraud_bool',
 'month']
In [24]:
# Sanity check: none of the categorical columns are still plain ints.
pd_fraud_train[lista_variables_categoricas].select_dtypes(int).columns
Out[24]:
Index([], dtype='object')
In [25]:
# Association between keep_alive_session and the target via Cramer's V.
# .values passes the crosstab as a plain numpy array of counts.
confusion_matrix = pd.crosstab(pd_fraud_train["keep_alive_session"], pd_fraud_train["fraud_bool"])
print(confusion_matrix)
cramers_v(confusion_matrix.values)
fraud_bool               0     1
keep_alive_session              
0                   332634  5820
1                   458543  3003
Out[25]:
0.050542586112449245

Después de probar entre diferentes variables, la mayor correlación con la variable objetivo la tiene 'keep_alive_session'

Tratamiento de valores nulos¶

En este caso, las variables categóricas no presentan valores nulos, por lo que no se tomará ninguna medida.

Guardado de los dataframe¶

In [26]:
# Persist the train/test splits for the next notebook.
# NOTE(review): saved without index=False, so the next loader must drop the
# resulting 'Unnamed: 0' column (as this notebook did at the top) — confirm.
pd_fraud_train.to_csv('./data_preprocessing/train_pd_data_preprocessing_missing_outlier.csv')
pd_fraud_test.to_csv('./data_preprocessing/test_pd_data_preprocessing_missing_outlier.csv')